In [114]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [101]:
# Load the held-out passenger file and preview its first rows
# (DataFrame.head() returns the first 5 rows by default).
test = pd.read_csv("test.csv")
test.head(5)
Out[101]:
In [112]:
# Load the labelled training passengers and preview the first rows.
mData = pd.read_csv("train.csv")
mData.head(5)
Out[112]:
In [103]:
# Column dtypes and non-null counts for both frames; the missing values
# reported here are what the later cells fill in.
mData.info()
# Single-argument print(...) is valid on both Python 2 and Python 3; the bare
# print statement used previously is a SyntaxError on Python 3.
print("_________________________________________________")
test.info()
In [113]:
# Remove columns that the rest of the notebook does not use.
# DataFrame.drop returns a NEW frame, so the result has to be assigned back;
# calling it without assignment only displayed the result and left mData/test
# unchanged. PassengerId is dropped from the training frame only, as before.
mData = mData.drop(["PassengerId", "Name", "Ticket"], axis=1)
test = test.drop(["Name", "Ticket"], axis=1)
# Show the reduced test frame (the cell previously displayed the drop result).
test.head()
Out[113]:
In [128]:
# Survival rate per port of embarkation, drawn as a wide factor plot.
sns.factorplot(x='Embarked', y='Survived', data=mData, size=4, aspect=3)

# One row of three panels for the remaining Embarked views.
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: passenger count per port.
sns.countplot(x='Embarked', data=mData, ax=axis1)
# Panel 2: passenger count per outcome (survived first), split by port.
sns.countplot(x='Survived', hue="Embarked", data=mData, order=[1, 0], ax=axis2)

# Panel 3: mean of Survived within each Embarked value, i.e. the survival
# rate per port (groupby leaves out rows whose Embarked is missing).
embarked_vs_survived = mData[["Embarked", "Survived"]]
embark_perc = embarked_vs_survived.groupby(['Embarked'], as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc,
            order=['S', 'C', 'Q'], ax=axis3)
Out[128]:
In [130]:
# Impute missing "Fare" values in the test frame with the median fare.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment and is not guaranteed to update `test`.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Convert fares from float to int (fractional part is truncated).
mData["Fare"] = mData["Fare"].astype(int)
test["Fare"] = test["Fare"].astype(int)

# Fares of passengers who did not survive / survived, selected in one
# .loc step (row mask + column) rather than by chained indexing.
fare_not_survived = mData.loc[mData["Survived"] == 0, "Fare"]
fare_survived = mData.loc[mData["Survived"] == 1, "Fare"]

# Mean and standard deviation of the fare per outcome;
# row 0 = did not survive, row 1 = survived.
# The spelling `avgerage_fare` is kept because later cells use that name.
avgerage_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])
In [138]:
# Histogram of training fares: 100 bins, x axis clipped to the 0-80 range.
mData['Fare'].plot(
    kind='hist',
    bins=100,
    xlim=(0, 80),
    figsize=(15, 3),
)
Out[138]:
In [143]:
# Name the index of both fare summaries "Survived" (row 0 / row 1 are the
# not-survived / survived groups), then display the averages.
std_fare.index.names = ["Survived"]
avgerage_fare.index.names = ["Survived"]
avgerage_fare
Out[143]:
In [147]:
# Bar chart of the mean fare per outcome, with the standard deviation as error bars.
avgerage_fare.plot(kind='bar', yerr=std_fare, legend=False)
Out[147]:
In [150]:
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')

# Private, seeded generator so the imputed ages are the same on every run
# and the global NumPy random state is left untouched.
AGE_IMPUTE_SEED = 0
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)

# Mean, standard deviation and number of missing Age values in the training frame.
average_age_titanic = mData["Age"].mean()
std_age_titanic = mData["Age"].std()
count_nan_age_titanic = int(mData["Age"].isnull().sum())

# Same statistics for the test frame.
average_age_test = test["Age"].mean()
std_age_test = test["Age"].std()
count_nan_age_test = int(test["Age"].isnull().sum())

# Plot the original (known) ages. Missing values are dropped before the
# int conversion because NaN cannot be represented in an integer column.
mData['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# Random integer ages in [mean - std, mean + std), one per missing value.
# Bounds are truncated to int explicitly instead of passing floats to randint.
rand_1 = age_rng.randint(int(average_age_titanic - std_age_titanic),
                         int(average_age_titanic + std_age_titanic),
                         size=count_nan_age_titanic)
rand_2 = age_rng.randint(int(average_age_test - std_age_test),
                         int(average_age_test + std_age_test),
                         size=count_nan_age_test)

# Fill the missing ages with a single .loc assignment per frame
# (chained indexing may write to a temporary copy instead of the frame).
mData.loc[mData["Age"].isnull(), "Age"] = rand_1
test.loc[test["Age"].isnull(), "Age"] = rand_2

# Only now, with no NaN left, convert Age from float to int.
mData['Age'] = mData['Age'].astype(int)
test['Age'] = test['Age'].astype(int)

# Plot the ages after imputation.
mData['Age'].hist(bins=70, ax=axis2)
Out[150]:
In [151]:
# Age, continued: density of Age for each outcome, drawn as one shaded KDE
# curve per Survived value on a wide facet.
oldest_age = mData['Age'].max()
facet = sns.FacetGrid(mData, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, oldest_age))
facet.add_legend()

# Mean of Survived (survival rate) for every distinct Age value.
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
age_vs_survived = mData[["Age", "Survived"]]
average_age = age_vs_survived.groupby(['Age'], as_index=False).mean()
# axis1 is the axes just created above, i.e. the current axes.
sns.barplot(x='Age', y='Survived', data=average_age, ax=axis1)
Out[151]:
In [152]:
# Family
# Replace the two columns Parch and SibSp with a single indicator:
# Family = 1 if the passenger had any relative aboard (Parch + SibSp > 0),
# otherwise 0, to see whether travelling with family relates to survival.
# The flag is computed in one vectorised expression; the previous
# df['Family'].loc[mask] = value form is chained assignment and may modify
# a temporary copy instead of the frame.
mData['Family'] = ((mData['Parch'] + mData['SibSp']) > 0).astype(int)
test['Family'] = ((test['Parch'] + test['SibSp']) > 0).astype(int)

# Parch and SibSp are no longer needed once Family exists.
mData = mData.drop(['SibSp', 'Parch'], axis=1)
test = test.drop(['SibSp', 'Parch'], axis=1)

# Left panel: passenger count per Family value; right panel: survival rate.
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))
sns.countplot(x='Family', data=mData, order=[1, 0], ax=axis1)

# Mean of Survived for passengers with / without family aboard.
family_perc = mData[["Family", "Survived"]].groupby(['Family'], as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1, 0], ax=axis2)

# Tick labels follow order=[1, 0]: 1 -> "With Family", 0 -> "Alone".
axis1.set_xticklabels(["With Family", "Alone"], rotation=0)
Out[152]:
In [153]:
# Sex
# Children (age below roughly 16) appear to have a higher chance of survival,
# so passengers are classified as male, female, or child.
def get_person(passenger, child_age_cutoff=16):
    """Classify one passenger as 'child' or by their sex.

    Parameters
    ----------
    passenger : sequence of length 2
        ``(age, sex)`` pair, e.g. one row of ``df[['Age', 'Sex']]`` as passed
        by ``DataFrame.apply(..., axis=1)``.
    child_age_cutoff : number, optional
        Ages strictly below this value are labelled ``'child'``
        (default 16, the value used previously).

    Returns
    -------
    ``'child'`` when ``age < child_age_cutoff``; otherwise ``sex`` unchanged.
    A NaN age compares False and therefore also yields ``sex``.
    """
    age, sex = passenger
    return 'child' if age < child_age_cutoff else sex
# Label every passenger 'child', 'male' or 'female' from the (Age, Sex) pair.
mData['Person'] = mData[['Age', 'Sex']].apply(get_person, axis=1)
test['Person'] = test[['Age', 'Sex']].apply(get_person, axis=1)

# Sex is no longer needed once Person exists.
mData = mData.drop(['Sex'], axis=1)
test = test.drop(['Sex'], axis=1)

# One indicator column per Person value.
# get_dummies names its columns after the values and orders them sorted
# ('child', 'female', 'male'), so the columns are renamed BY LABEL. Assigning
# the positional list ['Male', 'Female', 'Child'] mislabelled 'child' as Male
# (and 'male' as Child), so dropping 'Male' removed the child indicator.
PERSON_LABELS = {'child': 'Child', 'female': 'Female', 'male': 'Male'}

# The Male indicator is dropped (the original note gives the reason: males
# have the lowest average of survived passengers).
person_dummies_titanic = pd.get_dummies(mData['Person']).rename(columns=PERSON_LABELS)
person_dummies_titanic = person_dummies_titanic.drop(['Male'], axis=1)

person_dummies_test = pd.get_dummies(test['Person']).rename(columns=PERSON_LABELS)
person_dummies_test = person_dummies_test.drop(['Male'], axis=1)

mData = mData.join(person_dummies_titanic)
test = test.join(person_dummies_test)

# Left panel: passenger count per Person value; right panel: survival rate.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
sns.countplot(x='Person', data=mData, ax=axis1)

# Mean of Survived for each Person value (male, female, child).
person_perc = mData[["Person", "Survived"]].groupby(['Person'], as_index=False).mean()
sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2,
            order=['male', 'female', 'child'])

# The text column is redundant now that the indicator columns are joined.
mData = mData.drop(['Person'], axis=1)
test = test.drop(['Person'], axis=1)
In [ ]: